import pandas as pd
import numpy as np
import plotly.express as px
# Load the train and test splits from the local spaceship-titanic dataset folder.
train_raw_df = pd.read_csv("../dataset/spaceship-titanic/train.csv")
test_raw_df = pd.read_csv("../dataset/spaceship-titanic/test.csv")
# Independent working copy for processed/engineered columns, so that
# train_raw_df keeps the data exactly as it was read from disk.
train_process_df = train_raw_df.copy()
def explore_data(column, info=True, chart=None, chart_column="Transported"):
    """Print a text summary and show optional Plotly charts for one training column.

    All data is read from the module-level ``train_raw_df``.

    Args:
        column: Name of the ``train_raw_df`` column to inspect.
        info: When true, print the column's ``info()`` report, value counts,
            null count and ``describe()`` output, and return the column.
        chart: Container of chart names to draw. Recognised names are
            ``"bar"``, ``"pie"``, ``"hist"`` and ``"box"``; charts are always
            rendered in that order, regardless of the order given. ``None``
            (the default) draws nothing.
        chart_column: Column used to colour the grouped bar chart.

    Returns:
        The selected column as a Series when ``info`` is true, otherwise ``None``.
    """
    # A None sentinel replaces the original mutable list default.
    requested = () if chart is None else chart
    column_data = train_raw_df[column]
    palette = px.colors.qualitative.Antique

    if info:
        print("------- Column Info: -------")
        # Series.info() writes its report itself and returns None; wrapping it
        # in print() only appended a stray "None" line to the output.
        column_data.info()
        print("------- Data Counts: -------")
        print(column_data.value_counts())
        print("------- Null Check: -------")
        print(column_data.isnull().sum())
        print("------- Describe -------")
        print(column_data.describe())

    if "bar" in requested:
        print(f"------- Bar Plot {column} vs {chart_column}: -------")
        fig = px.bar(train_raw_df, x=column, color=chart_column, barmode="group",
                     color_discrete_sequence=palette)
        fig.update_traces(dict(marker_line_width=0))
        fig.show()

    if "pie" in requested:
        print(f"------- Pie Chart {column} Count plot: -------")
        # dropna=False keeps missing values as their own slice.
        counts = column_data.value_counts(dropna=False).to_dict()
        fig = px.pie(values=list(counts.values()), names=list(counts.keys()),
                     color_discrete_sequence=palette)
        fig.update_traces(dict(marker_line_width=0))
        fig.show()

    if "hist" in requested:
        print(f"------- Hist Chart {column} : -------")
        fig = px.histogram(x=column_data, barmode="group",
                           color_discrete_sequence=palette)
        fig.update_layout(bargap=0.1)
        fig.update_traces(dict(marker_line_width=0))
        fig.show()

    if "box" in requested:
        fig = px.box(train_raw_df, y=column, color_discrete_sequence=palette)
        fig.update_traces(dict(marker_line_width=0))
        fig.show()

    if info:
        return column_data
    return None
def change_data(column, type_cast=None):
    """Cast one column of the module-level ``train_process_df`` in place.

    Args:
        column: Name of the ``train_process_df`` column to convert.
        type_cast: Optional mapping whose ``"to"`` entry is the target dtype
            handed to ``Series.astype``. When omitted or empty, nothing is
            changed.

    Raises:
        KeyError: If ``type_cast`` is non-empty but has no ``"to"`` entry.
    """
    # A None sentinel replaces the original mutable ``dict()`` default.
    if not type_cast:
        return
    train_process_df[column] = train_process_df[column].astype(type_cast["to"])
# Display the test split as loaded.
test_raw_df
| | PassengerId | HomePlanet | CryoSleep | Cabin | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Name |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0013_01 | Earth | True | G/3/S | TRAPPIST-1e | 27.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Nelly Carsoning |
| 1 | 0018_01 | Earth | False | F/4/S | TRAPPIST-1e | 19.0 | False | 0.0 | 9.0 | 0.0 | 2823.0 | 0.0 | Lerome Peckers |
| 2 | 0019_01 | Europa | True | C/0/S | 55 Cancri e | 31.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Sabih Unhearfus |
| 3 | 0021_01 | Europa | False | C/1/S | TRAPPIST-1e | 38.0 | False | 0.0 | 6652.0 | 0.0 | 181.0 | 585.0 | Meratz Caltilter |
| 4 | 0023_01 | Earth | False | F/5/S | TRAPPIST-1e | 20.0 | False | 10.0 | 0.0 | 635.0 | 0.0 | 0.0 | Brence Harperez |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 4272 | 9266_02 | Earth | True | G/1496/S | TRAPPIST-1e | 34.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Jeron Peter |
| 4273 | 9269_01 | Earth | False | NaN | TRAPPIST-1e | 42.0 | False | 0.0 | 847.0 | 17.0 | 10.0 | 144.0 | Matty Scheron |
| 4274 | 9271_01 | Mars | True | D/296/P | 55 Cancri e | NaN | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Jayrin Pore |
| 4275 | 9273_01 | Europa | False | D/297/P | NaN | NaN | False | 0.0 | 2680.0 | 0.0 | 0.0 | 523.0 | Kitakan Conale |
| 4276 | 9277_01 | Earth | True | G/1498/S | PSO J318.5-22 | 43.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Lilace Leonzaley |
4277 rows × 13 columns
# Column dtypes and non-null counts for the training split.
train_raw_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8693 entries, 0 to 8692 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 8693 non-null object 1 HomePlanet 8492 non-null object 2 CryoSleep 8476 non-null object 3 Cabin 8494 non-null object 4 Destination 8511 non-null object 5 Age 8514 non-null float64 6 VIP 8490 non-null object 7 RoomService 8512 non-null float64 8 FoodCourt 8510 non-null float64 9 ShoppingMall 8485 non-null float64 10 Spa 8510 non-null float64 11 VRDeck 8505 non-null float64 12 Name 8493 non-null object 13 Transported 8693 non-null bool dtypes: bool(1), float64(6), object(7) memory usage: 891.5+ KB
# Peek at the first rows of the training split.
train_raw_df.head()
| | PassengerId | HomePlanet | CryoSleep | Cabin | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Name | Transported |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0001_01 | Europa | False | B/0/P | TRAPPIST-1e | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Maham Ofracculy | False |
| 1 | 0002_01 | Earth | False | F/0/S | TRAPPIST-1e | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | Juanna Vines | True |
| 2 | 0003_01 | Europa | False | A/0/S | TRAPPIST-1e | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | Altark Susent | False |
| 3 | 0003_02 | Europa | False | A/0/S | TRAPPIST-1e | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | Solam Susent | False |
| 4 | 0004_01 | Earth | False | F/1/S | TRAPPIST-1e | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | Willy Santantines | True |
# Class balance of the Transported column (explore_data's default colouring column).
train_raw_df.Transported.value_counts()
True 4378 False 4315 Name: Transported, dtype: int64
# Text summary plus a box plot of PassengerId.
# NOTE(review): PassengerId is a string identifier, so the box plot is
# presumably not informative here — confirm whether chart=["box"] is intended.
explore_data("PassengerId", chart=["box"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: PassengerId
Non-Null Count Dtype
-------------- -----
8693 non-null object
dtypes: object(1)
memory usage: 68.0+ KB
None
------- Data Counts: -------
0001_01 1
6136_01 1
6141_01 1
6139_06 1
6139_05 1
..
3126_01 1
3124_03 1
3124_02 1
3124_01 1
9280_02 1
Name: PassengerId, Length: 8693, dtype: int64
------- Null Check: -------
0
------- Describe -------
count 8693
unique 8693
top 0001_01
freq 1
Name: PassengerId, dtype: object
0 0001_01
1 0002_01
2 0003_01
3 0003_02
4 0004_01
...
8688 9276_01
8689 9278_01
8690 9279_01
8691 9280_01
8692 9280_02
Name: PassengerId, Length: 8693, dtype: object
# Eyeball ten randomly chosen ids; no random_state is set, so the rows differ on each run.
train_raw_df.PassengerId.sample(10)
1632 1728_01 1753 1865_02 8450 9026_01 7673 8190_01 4375 4656_01 2516 2703_01 7563 8084_01 5910 6268_01 3808 4066_02 3796 4050_01 Name: PassengerId, dtype: object
# Split each id on "_" into two columns (prefix and suffix).
train_raw_df.PassengerId.str.split("_", expand=True)
| | 0 | 1 |
|---|---|---|
| 0 | 0001 | 01 |
| 1 | 0002 | 01 |
| 2 | 0003 | 01 |
| 3 | 0003 | 02 |
| 4 | 0004 | 01 |
| ... | ... | ... |
| 8688 | 9276 | 01 |
| 8689 | 9278 | 01 |
| 8690 | 9279 | 01 |
| 8691 | 9280 | 01 |
| 8692 | 9280 | 02 |
8693 rows × 2 columns
# Distinct values of the part of the id before the "_".
train_raw_df.PassengerId.str.split("_", expand=True).iloc[:, 0].unique()
array(['0001', '0002', '0003', ..., '9278', '9279', '9280'], dtype=object)
# Number of distinct id prefixes (the part before the "_"); dropna=False
# counts a missing prefix as its own value, exactly as len(unique()) does.
train_raw_df["PassengerId"].str.split("_", expand=True)[0].nunique(dropna=False)
6217
# HomePlanet: text summary, share of each value (pie) and grouped bars coloured by Transported.
explore_data("HomePlanet", chart=["pie", "bar"])
------- Column Info: ------- <class 'pandas.core.series.Series'> RangeIndex: 8693 entries, 0 to 8692 Series name: HomePlanet Non-Null Count Dtype -------------- ----- 8492 non-null object dtypes: object(1) memory usage: 68.0+ KB None ------- Data Counts: ------- Earth 4602 Europa 2131 Mars 1759 Name: HomePlanet, dtype: int64 ------- Null Check: ------- 201 ------- Describe ------- count 8492 unique 3 top Earth freq 4602 Name: HomePlanet, dtype: object ------- Bar Plot HomePlanet vs Transported: -------
------- Pie Chart HomePlanet Count plot: -------
0 Europa
1 Earth
2 Europa
3 Europa
4 Earth
...
8688 Europa
8689 Earth
8690 Earth
8691 Europa
8692 Europa
Name: HomePlanet, Length: 8693, dtype: object
# CryoSleep: text summary only, no charts.
explore_data("CryoSleep")
------- Column Info: ------- <class 'pandas.core.series.Series'> RangeIndex: 8693 entries, 0 to 8692 Series name: CryoSleep Non-Null Count Dtype -------------- ----- 8476 non-null object dtypes: object(1) memory usage: 68.0+ KB None ------- Data Counts: ------- False 5439 True 3037 Name: CryoSleep, dtype: int64 ------- Null Check: ------- 217 ------- Describe ------- count 8476 unique 2 top False freq 5439 Name: CryoSleep, dtype: object
0 False
1 False
2 False
3 False
4 False
...
8688 False
8689 True
8690 False
8691 False
8692 False
Name: CryoSleep, Length: 8693, dtype: object
# CryoSleep grouped bar chart coloured by Transported; info=False skips the text summary.
explore_data("CryoSleep", info=False, chart=["bar"])
------- Bar Plot CryoSleep vs Transported: -------
# Same CryoSleep bar chart, but coloured by VIP instead of the default Transported.
explore_data("CryoSleep", info=False, chart=["bar"], chart_column="VIP")
------- Bar Plot CryoSleep vs VIP: -------
# Cabin: text summary only; values are "/"-separated strings.
explore_data("Cabin")
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: Cabin
Non-Null Count Dtype
-------------- -----
8494 non-null object
dtypes: object(1)
memory usage: 68.0+ KB
None
------- Data Counts: -------
G/734/S 8
G/109/P 7
B/201/P 7
G/1368/P 7
G/981/S 7
..
G/556/P 1
E/231/S 1
G/545/S 1
G/543/S 1
F/947/P 1
Name: Cabin, Length: 6560, dtype: int64
------- Null Check: -------
199
------- Describe -------
count 8494
unique 6560
top G/734/S
freq 8
Name: Cabin, dtype: object
0 B/0/P
1 F/0/S
2 A/0/S
3 A/0/S
4 F/1/S
...
8688 A/98/P
8689 G/1499/S
8690 G/1500/S
8691 E/608/S
8692 E/608/S
Name: Cabin, Length: 8693, dtype: object
# Cabin values are "/"-separated triples. Split once and reuse the result
# instead of re-running the same str.split for each of the three new columns.
cabin_parts = train_raw_df.Cabin.str.split("/", expand=True)
train_process_df["deck"] = cabin_parts[0]
train_process_df["Num"] = cabin_parts[1]
train_process_df["Side"] = cabin_parts[2]
train_process_df
| | PassengerId | HomePlanet | CryoSleep | Cabin | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Name | Transported | deck | Num | Side |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0001_01 | Europa | False | B/0/P | TRAPPIST-1e | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Maham Ofracculy | False | B | 0 | P |
| 1 | 0002_01 | Earth | False | F/0/S | TRAPPIST-1e | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | Juanna Vines | True | F | 0 | S |
| 2 | 0003_01 | Europa | False | A/0/S | TRAPPIST-1e | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | Altark Susent | False | A | 0 | S |
| 3 | 0003_02 | Europa | False | A/0/S | TRAPPIST-1e | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | Solam Susent | False | A | 0 | S |
| 4 | 0004_01 | Earth | False | F/1/S | TRAPPIST-1e | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | Willy Santantines | True | F | 1 | S |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 8688 | 9276_01 | Europa | False | A/98/P | 55 Cancri e | 41.0 | True | 0.0 | 6819.0 | 0.0 | 1643.0 | 74.0 | Gravior Noxnuther | False | A | 98 | P |
| 8689 | 9278_01 | Earth | True | G/1499/S | PSO J318.5-22 | 18.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Kurta Mondalley | False | G | 1499 | S |
| 8690 | 9279_01 | Earth | False | G/1500/S | TRAPPIST-1e | 26.0 | False | 0.0 | 0.0 | 1872.0 | 1.0 | 0.0 | Fayey Connon | True | G | 1500 | S |
| 8691 | 9280_01 | Europa | False | E/608/S | 55 Cancri e | 32.0 | False | 0.0 | 1049.0 | 0.0 | 353.0 | 3235.0 | Celeon Hontichre | False | E | 608 | S |
| 8692 | 9280_02 | Europa | False | E/608/S | TRAPPIST-1e | 44.0 | False | 126.0 | 4688.0 | 0.0 | 0.0 | 12.0 | Propsh Hontichre | True | E | 608 | S |
8693 rows × 17 columns
# Num came out of the Cabin split as strings; cast it to pandas' nullable
# Int64 dtype via the casting helper, then re-check the frame's dtypes.
change_data("Num", type_cast={"to": "Int64"})
train_process_df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8693 entries, 0 to 8692 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 8693 non-null object 1 HomePlanet 8492 non-null object 2 CryoSleep 8476 non-null object 3 Cabin 8494 non-null object 4 Destination 8511 non-null object 5 Age 8514 non-null float64 6 VIP 8490 non-null object 7 RoomService 8512 non-null float64 8 FoodCourt 8510 non-null float64 9 ShoppingMall 8485 non-null float64 10 Spa 8510 non-null float64 11 VRDeck 8505 non-null float64 12 Name 8493 non-null object 13 Transported 8693 non-null bool 14 deck 8494 non-null object 15 Num 8494 non-null Int64 16 Side 8494 non-null object dtypes: Int64(1), bool(1), float64(6), object(9) memory usage: 1.1+ MB
# Destination: text summary, share of each value (pie) and grouped bars coloured by Transported.
explore_data("Destination", chart=["pie", "bar"])
------- Column Info: ------- <class 'pandas.core.series.Series'> RangeIndex: 8693 entries, 0 to 8692 Series name: Destination Non-Null Count Dtype -------------- ----- 8511 non-null object dtypes: object(1) memory usage: 68.0+ KB None ------- Data Counts: ------- TRAPPIST-1e 5915 55 Cancri e 1800 PSO J318.5-22 796 Name: Destination, dtype: int64 ------- Null Check: ------- 182 ------- Describe ------- count 8511 unique 3 top TRAPPIST-1e freq 5915 Name: Destination, dtype: object ------- Bar Plot Destination vs Transported: -------
------- Pie Chart Destination Count plot: -------
0 TRAPPIST-1e
1 TRAPPIST-1e
2 TRAPPIST-1e
3 TRAPPIST-1e
4 TRAPPIST-1e
...
8688 55 Cancri e
8689 PSO J318.5-22
8690 TRAPPIST-1e
8691 55 Cancri e
8692 TRAPPIST-1e
Name: Destination, Length: 8693, dtype: object
# Age: text summary plus histogram and box plot of the values.
explore_data("Age", chart=["hist", "box"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: Age
Non-Null Count Dtype
-------------- -----
8514 non-null float64
dtypes: float64(1)
memory usage: 68.0 KB
None
------- Data Counts: -------
24.0 324
18.0 320
21.0 311
19.0 293
23.0 292
...
72.0 4
78.0 3
79.0 3
76.0 2
77.0 2
Name: Age, Length: 80, dtype: int64
------- Null Check: -------
179
------- Describe -------
count 8514.000000
mean 28.827930
std 14.489021
min 0.000000
25% 19.000000
50% 27.000000
75% 38.000000
max 79.000000
Name: Age, dtype: float64
------- Hist Chart Age : -------
0 39.0
1 24.0
2 58.0
3 33.0
4 16.0
...
8688 41.0
8689 18.0
8690 26.0
8691 32.0
8692 44.0
Name: Age, Length: 8693, dtype: float64
# VIP: text summary, grouped bars coloured by Transported and share of each value (pie).
explore_data("VIP", chart=["bar", "pie"])
------- Column Info: ------- <class 'pandas.core.series.Series'> RangeIndex: 8693 entries, 0 to 8692 Series name: VIP Non-Null Count Dtype -------------- ----- 8490 non-null object dtypes: object(1) memory usage: 68.0+ KB None ------- Data Counts: ------- False 8291 True 199 Name: VIP, dtype: int64 ------- Null Check: ------- 203 ------- Describe ------- count 8490 unique 2 top False freq 8291 Name: VIP, dtype: object ------- Bar Plot VIP vs Transported: -------
------- Pie Chart VIP Count plot: -------
0 False
1 False
2 True
3 False
4 False
...
8688 True
8689 False
8690 False
8691 False
8692 False
Name: VIP, Length: 8693, dtype: object
# RoomService: text summary plus histogram and box plot of the values.
explore_data("RoomService", chart=["box", "hist"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: RoomService
Non-Null Count Dtype
-------------- -----
8512 non-null float64
dtypes: float64(1)
memory usage: 68.0 KB
None
------- Data Counts: -------
0.0 5577
1.0 117
2.0 79
3.0 61
4.0 47
...
1612.0 1
2598.0 1
632.0 1
378.0 1
745.0 1
Name: RoomService, Length: 1273, dtype: int64
------- Null Check: -------
181
------- Describe -------
count 8512.000000
mean 224.687617
std 666.717663
min 0.000000
25% 0.000000
50% 0.000000
75% 47.000000
max 14327.000000
Name: RoomService, dtype: float64
------- Hist Chart RoomService : -------
0 0.0
1 109.0
2 43.0
3 0.0
4 303.0
...
8688 0.0
8689 0.0
8690 0.0
8691 0.0
8692 126.0
Name: RoomService, Length: 8693, dtype: float64
# FoodCourt: text summary plus a box plot of the values.
explore_data("FoodCourt", chart=["box"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: FoodCourt
Non-Null Count Dtype
-------------- -----
8510 non-null float64
dtypes: float64(1)
memory usage: 68.0 KB
None
------- Data Counts: -------
0.0 5456
1.0 116
2.0 75
3.0 53
4.0 53
...
3846.0 1
5193.0 1
312.0 1
827.0 1
4688.0 1
Name: FoodCourt, Length: 1507, dtype: int64
------- Null Check: -------
183
------- Describe -------
count 8510.000000
mean 458.077203
std 1611.489240
min 0.000000
25% 0.000000
50% 0.000000
75% 76.000000
max 29813.000000
Name: FoodCourt, dtype: float64
0 0.0
1 9.0
2 3576.0
3 1283.0
4 70.0
...
8688 6819.0
8689 0.0
8690 0.0
8691 1049.0
8692 4688.0
Name: FoodCourt, Length: 8693, dtype: float64
# ShoppingMall: text summary plus a box plot of the values.
explore_data("ShoppingMall", chart=["box"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: ShoppingMall
Non-Null Count Dtype
-------------- -----
8485 non-null float64
dtypes: float64(1)
memory usage: 68.0 KB
None
------- Data Counts: -------
0.0 5587
1.0 153
2.0 80
3.0 59
4.0 45
...
3627.0 1
2074.0 1
871.0 1
742.0 1
1872.0 1
Name: ShoppingMall, Length: 1115, dtype: int64
------- Null Check: -------
208
------- Describe -------
count 8485.000000
mean 173.729169
std 604.696458
min 0.000000
25% 0.000000
50% 0.000000
75% 27.000000
max 23492.000000
Name: ShoppingMall, dtype: float64
0 0.0
1 25.0
2 0.0
3 371.0
4 151.0
...
8688 0.0
8689 0.0
8690 1872.0
8691 0.0
8692 0.0
Name: ShoppingMall, Length: 8693, dtype: float64
# Spa: text summary plus a box plot of the values.
explore_data("Spa", chart=["box"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: Spa
Non-Null Count Dtype
-------------- -----
8510 non-null float64
dtypes: float64(1)
memory usage: 68.0 KB
None
------- Data Counts: -------
0.0 5324
1.0 146
2.0 105
5.0 53
3.0 53
...
273.0 1
2581.0 1
2948.0 1
3778.0 1
1643.0 1
Name: Spa, Length: 1327, dtype: int64
------- Null Check: -------
183
------- Describe -------
count 8510.000000
mean 311.138778
std 1136.705535
min 0.000000
25% 0.000000
50% 0.000000
75% 59.000000
max 22408.000000
Name: Spa, dtype: float64
0 0.0
1 549.0
2 6715.0
3 3329.0
4 565.0
...
8688 1643.0
8689 0.0
8690 1.0
8691 353.0
8692 0.0
Name: Spa, Length: 8693, dtype: float64
# VRDeck: text summary plus histogram and box plot of the values.
explore_data("VRDeck", chart=["box", "hist"])
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: VRDeck
Non-Null Count Dtype
-------------- -----
8505 non-null float64
dtypes: float64(1)
memory usage: 68.0 KB
None
------- Data Counts: -------
0.0 5495
1.0 139
2.0 70
3.0 56
5.0 51
...
408.0 1
876.0 1
2891.0 1
2102.0 1
3235.0 1
Name: VRDeck, Length: 1306, dtype: int64
------- Null Check: -------
188
------- Describe -------
count 8505.000000
mean 304.854791
std 1145.717189
min 0.000000
25% 0.000000
50% 0.000000
75% 46.000000
max 24133.000000
Name: VRDeck, dtype: float64
------- Hist Chart VRDeck : -------
0 0.0
1 44.0
2 49.0
3 193.0
4 2.0
...
8688 74.0
8689 0.0
8690 0.0
8691 3235.0
8692 12.0
Name: VRDeck, Length: 8693, dtype: float64
# Name: text summary only, no charts.
explore_data("Name")
------- Column Info: -------
<class 'pandas.core.series.Series'>
RangeIndex: 8693 entries, 0 to 8692
Series name: Name
Non-Null Count Dtype
-------------- -----
8493 non-null object
dtypes: object(1)
memory usage: 68.0+ KB
None
------- Data Counts: -------
Gollux Reedall 2
Elaney Webstephrey 2
Grake Porki 2
Sus Coolez 2
Apix Wala 2
..
Jamela Griffy 1
Hardy Griffy 1
Salley Mckinn 1
Mall Frasp 1
Propsh Hontichre 1
Name: Name, Length: 8473, dtype: int64
------- Null Check: -------
200
------- Describe -------
count 8493
unique 8473
top Gollux Reedall
freq 2
Name: Name, dtype: object
0 Maham Ofracculy
1 Juanna Vines
2 Altark Susent
3 Solam Susent
4 Willy Santantines
...
8688 Gravior Noxnuther
8689 Kurta Mondalley
8690 Fayey Connon
8691 Celeon Hontichre
8692 Propsh Hontichre
Name: Name, Length: 8693, dtype: object